#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by master on 2017/4/18
import random
import time

from yinhang.HtmlDownLoader import HtmlDownloader
from bs4 import BeautifulSoup
import re
import xlwt
from yinhang.Save2Excel import Save


class Parser(object):
    """Extract detail-page links from a PBC (People's Bank of China) listing page."""

    # Links on the listing page are site-relative; prefix them with this host.
    host = 'http://www.pbc.gov.cn'

    def __init__(self):
        pass

    def parser(self, html):
        """Parse one listing page and return the absolute URLs of all linked pages.

        :param html: HTML text of a listing page.
        :return: list of absolute URLs; empty if the result container
                 (id='zwgk_rlist') is missing or holds no hrefs.
        """
        soup = BeautifulSoup(html, "lxml")
        container = soup.find(id='zwgk_rlist')
        if container is None:
            # Download failed or the page layout changed: no result container.
            return []
        # NOTE: the original re-parsed each tag with BeautifulSoup(str(a), ...)
        # just to call has_attr(); the Tag object supports has_attr directly.
        return [self.host + a['href']
                for a in container.find_all('a')
                if a.has_attr('href')]

    def packge_url(self, index):
        """Build the URL of listing page number *index* (1-based).

        (Name kept as-is — ``packge`` typo included — for caller compatibility.)
        """
        return (
            'http://www.pbc.gov.cn/zhengwugongkai/127924/128041/2951606/1923625/1923629/d6d180ae/index%s.html' % index)


if __name__ == '__main__':
    # Crawl 14 listing pages of company info from the PBC site, extract the
    # detail-page URLs on each, and save one row per URL into a single
    # Excel sheet.
    wb = xlwt.Workbook()
    ws = wb.add_sheet('company info')
    downloader = HtmlDownloader()
    p = Parser()
    count = 0  # running row index in the worksheet
    for i in range(1, 15):
        # Optional politeness delay between page fetches:
        # time.sleep(random.randint(0, 5))
        html = downloader.download(p.packge_url(i))
        # Iterate the URL list directly instead of indexing by range(len(...)).
        for url in p.parser(html):
            Save.save(url, count, ws)
            count += 1
    wb.save('公司信息.xls')
